import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import pandas as pd # use for data manipulation and analysis
import numpy as np # use for multi-dimensional array and matrix
import seaborn as sns # use for high-level interface for drawing attractive and informative statistical graphics
import matplotlib.pyplot as plt # It provides an object-oriented API for embedding plots into applications
%matplotlib inline
# It sets the backend of matplotlib to the 'inline' backend:
import plotly.express as px
from sklearn.linear_model import LogisticRegression # algo use to predict good or bad
from sklearn.naive_bayes import MultinomialNB # nlp algo use to predict good or bad
from sklearn.model_selection import train_test_split # spliting the data between feature and target
from sklearn.metrics import classification_report # gives whole report about metrics (e.g, recall,precision,f1_score,c_m)
from sklearn.metrics import confusion_matrix # gives info about actual and predict
from nltk.tokenize import RegexpTokenizer # regexp tokenizers use to split words from text
from nltk.stem.snowball import SnowballStemmer # stemmes words
from sklearn.feature_extraction.text import CountVectorizer # create sparse matrix of words using regexptokenizes
from sklearn.pipeline import make_pipeline # use for combining all prerocessors techniuqes and algos
from PIL import Image # getting images in notebook
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator# creates words colud
from bs4 import BeautifulSoup # use for scraping the data from website
import networkx as nx # for the creation, manipulation, and study of the structure, dynamics, and functions of complex networks.
import pickle # use to save model
import warnings # ignores pink warnings
warnings.filterwarnings('ignore')
# Loading the dataset
# One row per URL with a 'Label' column of 'good' or 'bad' (phishing).
df = pd.read_csv('phishing data/phishing_site_urls.csv')
df.head()  # preview the first five rows
| URL | Label | |
|---|---|---|
| 0 | nobell.it/70ffb52d079109dca5664cce6f317373782/... | bad |
| 1 | www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc... | bad |
| 2 | serviciosbys.com/paypal.cgi.bin.get-into.herf.... | bad |
| 3 | mail.printakid.com/www.online.americanexpress.... | bad |
| 4 | thewhiskeydregs.com/wp-content/themes/widescre... | bad |
df.tail()  # preview the last five rows; the end of the file is all 'bad' URLs
| URL | Label | |
|---|---|---|
| 549341 | 23.227.196.215/ | bad |
| 549342 | apple-checker.org/ | bad |
| 549343 | apple-iclods.org/ | bad |
| 549344 | apple-uptoday.org/ | bad |
| 549345 | apple-search.info | bad |
df.info()  # 549,346 rows, two object columns (URL, Label), no nulls
<class 'pandas.core.frame.DataFrame'> RangeIndex: 549346 entries, 0 to 549345 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 URL 549346 non-null object 1 Label 549346 non-null object dtypes: object(2) memory usage: 8.4+ MB
df.isnull().sum() # confirm there are no missing values in either column
URL 0 Label 0 dtype: int64
# Create a one-column DataFrame of class counts to visualize class balance.
label_counts = pd.DataFrame(df.Label.value_counts())
# Bar chart of 'good' vs 'bad' counts.
# NOTE(review): in pandas >= 2.0 value_counts() names its column 'count',
# so label_counts.Label would raise AttributeError there -- confirm the
# pandas version this notebook targets.
fig = px.bar(label_counts, x=label_counts.index, y=label_counts.Label)
fig.show()
# Tokenizer that keeps only runs of ASCII letters; digits and punctuation
# in the URLs are discarded.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
df.URL[0]  # inspect one raw URL before tokenizing
'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'
# Pull out every letter-only token matching the expression
np.array(tokenizer.tokenize(df.URL[0])) # demo on the first row
array(['nobell', 'it', 'ffb', 'd', 'dca', 'cce', 'f', 'login', 'SkyPe',
'com', 'en', 'cgi', 'bin', 'verification', 'login', 'ffb', 'd',
'dca', 'cce', 'f', 'index', 'php', 'cmd', 'profile', 'ach',
'outdated', 'page', 'tmpl', 'p', 'gen', 'failed', 'to', 'load',
'nav', 'login', 'access'], dtype='<U12')
df['text_tokenized'] = df.URL.map(lambda t: tokenizer.tokenize(t)) # tokenize every URL row
df.sample(3)  # spot-check a few tokenized rows
| URL | Label | text_tokenized | |
|---|---|---|---|
| 326935 | facebook.com/defyfc | good | [facebook, com, defyfc] |
| 354109 | hoovers.com/company/Provigo_Inc/hcrtif-1.html | good | [hoovers, com, company, Provigo, Inc, hcrtif, ... |
| 289996 | beenverified.com/p/rick+rodriguez | good | [beenverified, com, p, rick, rodriguez] |
stemmer = SnowballStemmer("english") # choose a language
# Stem every token of every row (e.g. 'library' -> 'librari').
df['text_stemmed'] = df['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])
df.sample(5)  # spot-check tokenized vs stemmed columns
| URL | Label | text_tokenized | text_stemmed | |
|---|---|---|---|---|
| 477395 | youtube.com/watch?v=oJVsM9ZpsfI | good | [youtube, com, watch, v, oJVsM, ZpsfI] | [youtub, com, watch, v, ojvsm, zpsfi] |
| 271988 | allmusic.com/album/fijacin-oral-vol-1-r744021 | good | [allmusic, com, album, fijacin, oral, vol, r] | [allmus, com, album, fijacin, oral, vol, r] |
| 207698 | library.utah.gov/ | good | [library, utah, gov] | [librari, utah, gov] |
| 131875 | account-service-update.com/Login/update/ | bad | [account, service, update, com, Login, update] | [account, servic, updat, com, login, updat] |
| 360971 | impawards.com/2007/eastern_promises.html | good | [impawards, com, eastern, promises, html] | [impaward, com, eastern, promis, html] |
# Re-join the stemmed tokens into one space-separated string per URL --
# the input format CountVectorizer expects.
df['text_sent'] = df['text_stemmed'].map(lambda l: ' '.join(l))
df.sample(2)
| URL | Label | text_tokenized | text_stemmed | text_sent | |
|---|---|---|---|---|---|
| 115355 | ajempi.com.br/2013/administrator/components/co... | bad | [ajempi, com, br, administrator, components, c... | [ajempi, com, br, administr, compon, com, conf... | ajempi com br administr compon com config life... |
| 68006 | tools.ietf.org/html/rfc2121 | good | [tools, ietf, org, html, rfc] | [tool, ietf, org, html, rfc] | tool ietf org html rfc |
1. Visualize the most frequent words in each class using word clouds
# Slice the frame into the two classes so each gets its own word cloud.
bad_sites = df[df.Label == 'bad']
good_sites = df[df.Label == 'good']
bad_sites.head(2)
| URL | Label | text_tokenized | text_stemmed | text_sent | |
|---|---|---|---|---|---|
| 0 | nobell.it/70ffb52d079109dca5664cce6f317373782/... | bad | [nobell, it, ffb, d, dca, cce, f, login, SkyPe... | [nobel, it, ffb, d, dca, cce, f, login, skype,... | nobel it ffb d dca cce f login skype com en cg... |
| 1 | www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc... | bad | [www, dghjdgf, com, paypal, co, uk, cycgi, bin... | [www, dghjdgf, com, paypal, co, uk, cycgi, bin... | www dghjdgf com paypal co uk cycgi bin webscrc... |
good_sites.head(2)  # preview the 'good' slice
| URL | Label | text_tokenized | text_stemmed | text_sent | |
|---|---|---|---|---|---|
| 18231 | esxcc.com/js/index.htm?us.battle.net/noghn/en/... | good | [esxcc, com, js, index, htm, us, battle, net, ... | [esxcc, com, js, index, htm, us, battl, net, n... | esxcc com js index htm us battl net noghn en r... |
| 18232 | wwweira¯&nvinip¿ncH¯wVö%ÆåyDaHðû/ÏyEùuË\nÓ6... | good | [www, eira, nvinip, ncH, wV, yDaH, yE, u, rT, ... | [www, eira, nvinip, nch, wv, ydah, ye, u, rt, ... | www eira nvinip nch wv ydah ye u rt u g m i xz... |
def plot_wordcloud(text, mask=None, max_words=400, max_font_size=120, figure_size=(20,12),
                   title = None, title_size=30, image_color=False):
    """Render a word cloud of *text* with matplotlib.

    text        -- a single string of space-separated words
    mask        -- optional image array shaping the cloud; also supplies the
                   colors when image_color=True
    image_color -- recolor words from the mask image instead of the default palette
    """
    stopwords = set(STOPWORDS)
    more_stopwords = {'com','http'}  # URL boilerplate carries no signal
    stopwords = stopwords.union(more_stopwords)
    wordcloud = WordCloud(background_color='white',
                    stopwords = stopwords,
                    max_words = max_words,
                    max_font_size = max_font_size,
                    random_state = 42,  # deterministic layout across runs
                    mask = mask)
    # Generate exactly once: the original called generate() a second time inside
    # a print(), doubling the most expensive step and printing only the object
    # repr (the stray "<wordcloud...>" lines in the notebook output).
    wordcloud.generate(text)
    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
        plt.title(title, fontdict={'size': title_size,
                                   'verticalalignment': 'bottom'})
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={'size': title_size, 'color': 'green',
                                   'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.show()
# d = '../input/masks/masks-wordclouds/'
# Build one text blob from ALL good-site rows.  str(Series) -- as the original
# did -- stringifies only the truncated repr (a handful of rows plus index
# numbers and a dtype footer), so join the actual strings instead.
data = good_sites.text_sent
data.reset_index(drop=True, inplace=True)
common_text = ' '.join(data)
plot_wordcloud(common_text, max_words=50, max_font_size=80,
               title = 'Most common words use in good urls', title_size=15)
<wordcloud.wordcloud.WordCloud object at 0x0000028F0E638850>
# Same cloud for the bad sites.  Use the joined-string column text_sent: the
# original read text_stemmed (a column of token LISTS) and stringified the
# Series repr, so the cloud was built from repr noise, not the corpus.
data = bad_sites.text_sent
data.reset_index(drop=True, inplace=True)
common_text = ' '.join(data)
plot_wordcloud(common_text, max_words=50, max_font_size=80,
               title = 'Most common words use in bad urls', title_size=15)
<wordcloud.wordcloud.WordCloud object at 0x0000028EB24767C0>
#create cv object
cv = CountVectorizer()
feature = cv.fit_transform(df.text_sent) # bag-of-words sparse matrix over the stemmed text
feature[:5].toarray() # densify a few rows for display only (the full matrix would not fit in memory)
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=int64)
# 75/25 train/test split.  Seed the RNG so the split -- and every score
# reported below -- is reproducible across runs (the original was unseeded).
trainX, testX, trainY, testY = train_test_split(feature, df.Label, random_state=42)
# Baseline model: logistic regression on the bag-of-words counts.
lr= LogisticRegression()
lr.fit(trainX,trainY)
lr.score(testX,testY)
0.964233964627158
* Logistic Regression gives 96% accuracy. Now we will store the scores in a dict to see which model performs best.
# Collect per-model test accuracies for a final comparison chart.
Scores_ml = {}
Scores_ml['Logistic Regression'] = np.round(lr.score(testX,testY),2)
print('Training Accuracy :',lr.score(trainX,trainY))
print('Testing Accuracy :',lr.score(testX,testY))
# Predict once and reuse.  sklearn metrics take (y_true, y_pred); the original
# passed them swapped, which transposes the confusion matrix relative to its
# 'Actual'/'Predicted' labels and attributes precision/recall to the wrong axis.
pred_lr = lr.predict(testX)
con_mat = pd.DataFrame(confusion_matrix(testY, pred_lr),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])
print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, pred_lr,
                                        target_names =['Bad','Good']))
print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")
Training Accuracy : 0.979133950957382
Testing Accuracy : 0.964233964627158
CLASSIFICATION REPORT
precision recall f1-score support
Bad 0.91 0.97 0.94 36865
Good 0.99 0.96 0.98 100472
accuracy 0.96 137337
macro avg 0.95 0.96 0.96 137337
weighted avg 0.97 0.96 0.96 137337
CONFUSION MATRIX
<AxesSubplot:>
# Second baseline: multinomial naive Bayes, a standard choice for
# bag-of-words count features.
mnb = MultinomialNB()
mnb.fit(trainX,trainY)
MultinomialNB()
mnb.score(testX,testY)
0.9578482127904352
* MultinomialNB gives us 95% accuracy
Scores_ml['MultinomialNB'] = np.round(mnb.score(testX,testY),2)
print('Training Accuracy :',mnb.score(trainX,trainY))
print('Testing Accuracy :',mnb.score(testX,testY))
# Predict once and reuse.  sklearn metrics take (y_true, y_pred); the original
# swapped them, transposing the confusion matrix relative to its
# 'Actual'/'Predicted' labels.
pred_mnb = mnb.predict(testX)
con_mat = pd.DataFrame(confusion_matrix(testY, pred_mnb),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])
print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, pred_mnb,
                                        target_names =['Bad','Good']))
print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")
Training Accuracy : 0.9740491105776815
Testing Accuracy : 0.9578482127904352
CLASSIFICATION REPORT
precision recall f1-score support
Bad 0.92 0.94 0.93 38426
Good 0.97 0.97 0.97 98911
accuracy 0.96 137337
macro avg 0.95 0.95 0.95 137337
weighted avg 0.96 0.96 0.96 137337
CONFUSION MATRIX
<AxesSubplot:>
# Compare the stored model accuracies side by side.
acc = pd.DataFrame.from_dict(Scores_ml,orient = 'index',columns=['Accuracy'])
sns.set_style('darkgrid')
# Pass x/y as keywords: positional data arguments to barplot were deprecated
# in seaborn 0.12 and removed in later releases.
sns.barplot(x=acc.index, y=acc.Accuracy)
<AxesSubplot:ylabel='Accuracy'>
* So, Logistic Regression is the best fit model, Now we make sklearn pipeline using Logistic Regression
# End-to-end pipeline: tokenize raw URLs with the letter-only regex, drop
# English stop words, count-vectorize, then fit logistic regression.
pipeline_ls = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), LogisticRegression())
##(r'\b(?:http|ftp)s?://\S*\w|\w+|[^\w\s]+') ([a-zA-Z]+)([0-9]+) -- these tokenizers gave lower accuracy
# Re-split on the RAW URL strings (the pipeline does its own preprocessing).
# Seed the split for reproducibility, matching the earlier split.
trainX, testX, trainY, testY = train_test_split(df.URL, df.Label, random_state=42)
pipeline_ls.fit(trainX,trainY)
Pipeline(steps=[('countvectorizer',
CountVectorizer(stop_words='english',
tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[A-Za-z]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)>)),
('logisticregression', LogisticRegression())])
pipeline_ls.score(testX,testY)
0.9651659785782418
print('Training Accuracy :',pipeline_ls.score(trainX,trainY))
print('Testing Accuracy :',pipeline_ls.score(testX,testY))
# Predict once and reuse.  sklearn metrics take (y_true, y_pred); the original
# swapped them, transposing the confusion matrix relative to its
# 'Actual'/'Predicted' labels.
pred_pl = pipeline_ls.predict(testX)
con_mat = pd.DataFrame(confusion_matrix(testY, pred_pl),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])
print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, pred_pl,
                                        target_names =['Bad','Good']))
print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")
Training Accuracy : 0.9791096796429204
Testing Accuracy : 0.9651659785782418
CLASSIFICATION REPORT
precision recall f1-score support
Bad 0.91 0.97 0.94 36748
Good 0.99 0.96 0.98 100589
accuracy 0.97 137337
macro avg 0.95 0.97 0.96 137337
weighted avg 0.97 0.97 0.97 137337
CONFUSION MATRIX
<AxesSubplot:>
# Persist the fitted pipeline.  Use context managers so the file handles are
# closed deterministically -- the original left both opened inline and dangling.
with open('phishing.pkl','wb') as f:
    pickle.dump(pipeline_ls, f)
# Round-trip check: reload and confirm the score matches the live model.
with open('phishing.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
result = loaded_model.score(testX,testY)
print(result)
0.9651659785782418
* We get an accuracy of 96.5% on the test set.
# Smoke-test the persisted pipeline on a few hand-picked URLs:
# the first list is known phishing, the second known legitimate.
predict_bad = ['yeniik.com.tr/wp-admin/js/login.alibaba.com/login.jsp.php','fazan-pacir.rs/temp/libraries/ipad',
'tubemoviez.exe','svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt']
predict_good = ['youtube.com/','youtube.com/watch?v=qI0TQJI3vdU','retailhellunderground.com/','restorevisioncenters.com/html/technology.html']
# NOTE: only unpickle files you produced yourself -- pickle.load on untrusted
# data can execute arbitrary code.
with open('phishing.pkl', 'rb') as f:  # close the handle (was left open)
    loaded_model = pickle.load(f)
# The pipeline embeds its own tokenizer/vectorizer, so raw URL strings can be
# passed straight in -- no manual transform step is needed.
result = loaded_model.predict(predict_bad)
result2 = loaded_model.predict(predict_good)
print(result)
print("*"*30)
print(result2)
['bad' 'bad' 'bad' 'bad'] ****************************** ['good' 'good' 'good' 'good']